This document demonstrates the use of the weightedRF and weightedLASSO functions for integrative GRN inference.
It shows how to select one optimal alpha value per gene, in order to apply data integration only to the target genes for which expression prediction error is reduced by using PWM prior information, significantly more than in a baseline model.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5
ALPHAS <- seq(0,1, by = 0.1)
The following steps are time and CPU intensive, so the result files can just be loaded to be analysed in further steps.
Generating 100 repetitions of weightedRF MSE estimation on true data, and on shuffled data.
# lmses <- data.frame(row.names = genes)
# N <-100
# for(alpha in ALPHAS){
# for(perm in 1:N){
# lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
# pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = F)
# }
#
# for(perm in 1:N){
# lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
# pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = T)
# }
# }
#
# save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
# subset<-unique(rownames(lmses))
Generating 100 repetitions of weightedLASSO MSE estimation on true data, and on shuffled data.
# lmses <- data.frame(row.names = genes)
# N<-50
# for(alpha in ALPHAS){
# for(perm in 1:N){
# lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
# pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = F)
#
# lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
# pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = T)
# }
# }
# save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")
nCores = 45
mats <- list()
nrep <- 100
for(alpha in ALPHAS){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_rf <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mat_rf_perm <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha, tf_expression_permutation = TRUE,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mats[[paste0("bRF_", as.character(alpha), '_trueData_', rep)]] <- mat_rf
mats[[paste0("bRF_", as.character(alpha), '_shuffled_', rep)]] <- mat_rf_perm
}
}
save(mats, file = "results/100_permutations_bRF_importances_inf.rdata")
# thresholds the regulatory weights at a certain density to build GRNs
edges <- list()
lmses <- data.frame(row.names = genes)
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){
for(density in densities){# exploring importance threshold stringency
edges[[paste0(name, '_', density)]] <-
weightedRF_network(mats[[name]], density = density, pwm_occurrence, genes, tfs)
lmses[,name] <- mats[[name]]["mse",]
}
}
save(edges, file = "results/100_permutations_bRF_edges_inf.rdata")
save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_rf_validation_dap_inf.rdata")
nCores = 40
# mats <- list()
nrep <- 50
for(alpha in c(1)){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_lasso <- weightedLASSO_inference(counts = counts, genes = genes, tfs = tfs,
alpha = alpha, N = 50,
tf_expression_permutation = FALSE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
alpha = alpha, N = 50,
tf_expression_permutation = TRUE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mats[[paste0("LASSO_", as.character(alpha), '_trueData_', rep)]] <- mat_lasso
mats[[paste0("LASSO_", as.character(alpha), '_shuffled_', rep)]] <- mat_lasso_perm
}
}
save(mats, file = "results/100_permutations_lasso_mda_shuffle.rdata")
# thresholds the regulatory weights at certain densities to build GRNs
edges <- list()
lmses <- data.frame(row.names = genes)
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){
for(density in densities){# exploring importance threshold stringency
edges[[paste0(name, '_', density)]] <-
weightedLASSO_network(mats[[name]], density = density, pwm_occurrence,
genes, tfs, decreasing = TRUE)
lmses[,name] <- mats[[name]]["mse",]
}
}
save(edges, file = "results/100_permutations_lasso_edges.rdata")
save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")
# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_lasso_validation_dap.rdata")
# prior values
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5
# parallel sapply to parallelise the computing of optimal alphas
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
FUN <- match.fun(FUN)
answer <- parallel::mclapply(X = X, FUN = FUN, ...)
if (USE.NAMES && is.character(X) && is.null(names(answer)))
names(answer) <- X
if (!isFALSE(simplify) && length(answer))
simplify2array(answer, higher = (simplify == "array"))
else answer
}
# needs an appropriate mats variable and a lmses variable to be loaded
# plots the importance of TFs that have a specific prior value, here one,
# as alpha is increased
draw_gene_effective_integration <- function(gene, prior=1, type = "rank"){
tfs_with_motif <- names(which(pwm_imputed[gene,]== prior))
if(type == "rank")
data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
if(type == "imp")
data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
data <- data %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_")
plot <- data %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))%>%
ggplot(aes(x=alpha, y = summed_importance, color = dataset, fill = dataset)) +
geom_point(alpha = 0.1) + geom_line(aes(y=mean_imp))+
geom_ribbon(aes(ymin = mean_imp - sd_imp ,
ymax = mean_imp + sd_imp ),
alpha = .4) +theme_pubr(legend = "none")+
ylab("Effective data integration")+
ggtitle("Effective data integration") +
labs(subtitle = gene)+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
plot + xlab(expression(alpha))
}
# plots the MSE as alpha is increased
draw_gene_mse <- function(gene, title = NULL){
data <-
lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("model", "alpha", "dataset", "rep"),
sep = "_")
data %>%
group_by(alpha, dataset) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
ggplot(aes(
x = as.numeric(alpha),
y = value,
color = dataset,
fill = dataset
)) +ggtitle(paste("MSE"))+ylab("MSE/Var(gene)")+
geom_ribbon(aes(ymin = mean_mse - sd_mse ,
ymax = mean_mse + sd_mse ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_line(aes(y=mean_mse))+xlab(expression(alpha))+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
}
# plots the MSE as a function of effective data integration
get_opt_alpha_per_gene <- function(gene, type = "rank",
return_cluster = F, metric = 'div'){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
# gene with no TFBS has an optimal alpha of 0
if(return_cluster & length(tfs_with_motif)==0) return(0)
# else
if(length(tfs_with_motif)>0){
if(type == "rank")
data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
if(type == "imp")
data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
inte <- data %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance, na.rm=T),
sd_imp = sd(summed_importance, na.rm=T),
alpha = as.numeric(alpha))
curves <- lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("model", "alpha", "dataset", "rep"),
sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
mutate(alpha = as.numeric(alpha))%>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(alpha, mean_imp, dataset) %>%
summarise(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T))
curves <- curves %>%
group_by(dataset) %>%
arrange(dataset, mean_imp)%>%
mutate(imps=curves[curves$dataset=="trueData", ]$mean_imp) %>%
mutate(approx_mse = approx(mean_imp,mean_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y,
approx_sd = approx(mean_imp,sd_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y)
true <- curves[curves$dataset=="trueData",]
shuff <- curves[curves$dataset!="trueData",]
# true$div <- (shuff$approx_mse - shuff$approx_sd) - (true$mean_mse)
true$div <- ifelse((shuff$approx_mse - true$mean_mse)/shuff$approx_sd>1,
(shuff$approx_mse - true$mean_mse)/shuff$approx_sd,
0)
if(metric == "div"){
if(max(true$div)>0) alpha_opt <- true[true$div == max(true$div),]$alpha
else alpha_opt <- 0
}
if(metric == "min"){
alpha_opt <- true[true$mean_mse == min(true$mean_mse),]$alpha
}
eff_opt = true[true$alpha==alpha_opt,]$mean_imp
padding = ifelse(type=="rank", ifelse(alpha_opt<0.2, 17, -17), 0)
if(return_cluster) return(alpha_opt)
curves%>%
ggplot(aes(y=mean_mse, x = mean_imp, color = dataset, fill = dataset))+
# geom_ribbon(aes(ymin =mean_mse-sd_mse ,
# ymax = mean_mse + sd_mse), alpha = .4)+
geom_ribbon(aes(x=imps,ymin =approx_mse-approx_sd ,
ymax = approx_mse + approx_sd), alpha = .25)+
geom_point(alpha = 0.1, size = 0.5) +
geom_line(aes(x=mean_imp, y = mean_mse), size=1) +
# geom_line(aes(x=imps, y = approx_mse), col="black")+
theme_pubr(legend = "none") +
ylab("MSE/Var(gene)") + xlab("Effective data integration") + ggtitle("MSE=f(effective integration)")+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
geom_vline(xintercept = eff_opt, size = 2, col="#4670CD") +
annotate("text", x=eff_opt+padding, y=max(shuff$mean_mse),
label=paste("Optimal\nintegration:\nalpha =", alpha_opt),
angle=0, col = "#4670CD", size =3.5 )
}
else ggplot()
}
# importance metrics and MSE for weightedRF
load("results/brf_perumtations_mse_all_genes_predict.rdata")
load("results/100_permutations_bRF_importances_inf.rdata")
gene_no_benefit <- "AT1G30270"
gene_benefit_optimum <- "AT1G14720"
gene_benefit <- "AT5G48970"
n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
draw_gene_mse(gene_no_benefit)+
get_opt_alpha_per_gene(gene_no_benefit)+
plot_annotation(title = gene_no_benefit)
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit_optimum)+
plot_annotation(title = gene_benefit_optimum)
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit)+
plot_annotation(title = gene_benefit)
figure <- n/o/p;figure
ggexport(figure, filename = "results/gene_examples_weightedRF.pdf", width = 10, height = 9)
# value of alpha per genes:
alphas_rf <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank",
return_cluster=T, mc.cores=34)
hist(alphas_rf)
save(alphas_rf, file = "results/alpha_per_gene_weighted_RF.rdata")
alphas_rf <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank", metric = "min",
return_cluster=T, mc.cores=34)
hist(alphas_rf)
save(alphas_rf, file = "results/alpha_per_gene_weighted_RF_min.rdata")
#turns old mse of RF into new format returned by inference
colnames(lmses) <-
paste0("RF", "_", str_split_fixed(colnames(lmses), ' ', 3)[,1], '_',
str_split_fixed(colnames(lmses), ' ', 3)[,3], "_",
str_split_fixed(colnames(lmses), ' ', 3)[,2]) %>%
str_replace("_data", "Data")
save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
Doing the same for weightedLASSO:
# for the lasso
load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/100_permutations_lasso_mda_shuffle.rdata")
# load("results/100_permutations_lasso_edges.rdata")
n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
draw_gene_mse(gene_no_benefit)+
get_opt_alpha_per_gene(gene_no_benefit)+
plot_annotation(title = gene_no_benefit)
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit_optimum)+
plot_annotation(title = gene_benefit_optimum)
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit)+
plot_annotation(title = gene_benefit)
n/o/p
figure <- n/o/p
ggexport(figure, filename = "results/gene_examples_weightedLASSO.pdf", width = 10, height = 9)
# value of alpha per genes:
alphas_lasso <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank",
return_cluster=T, mc.cores=34)
hist(alphas_lasso)
save(alphas_lasso, file = "results/alpha_per_gene_weighted_LASSO_test.rdata")
alphas_lasso <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank",
return_cluster=T, metric = "min", mc.cores=34)
hist(alphas_lasso)
save(alphas_lasso, file = "results/alpha_per_gene_weighted_LASSO_min.rdata")
For divergence from permutation criteria :
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
hists <- data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
gather(key = "model", value = "alpha") %>%
ggplot(aes(x = alpha, fill = model)) +
geom_histogram()+
facet_wrap(~model)+
theme(axis.title.y = element_blank(), legend.position = "none")+
scale_fill_manual( values = c("#C55A11","#E67F87"))+theme_bw()+
theme(strip.background = element_blank(),
legend.position = "none")+
xlab(expression(alpha))+ ylab("Number of genes")
ggexport(hists, filename = "results/alpha_histograms.pdf", height = 3, width = 4.5)
p<-data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
rownames_to_column("gene") %>%
mutate(gene_type=gene %in% tfs) %>%
ggplot(aes(x=weightedLASSO, y=weightedRF)) +
geom_point(alpha = 0)+
geom_bin2d(bins=11)+
scale_fill_gradient(name = "count", trans = "log",
low = "white", high = "#4670CD",
breaks = c(2,10,50,250,700),
labels = c(2,10,50,250,700))+
theme_pubclean() + theme(legend.position = "right",
axis.text = element_text(size = 15),
axis.title = element_text(size = 20))
ggExtra::ggMarginal(p, type = "histogram", fill = "#C55A11", size = 1)
For minimum MSE criteria :
load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")
data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
gather(key = "model", value = "alpha") %>%
ggplot(aes(x = alpha, fill = model)) +
geom_histogram()+
facet_wrap(~model)+
theme(axis.title.y = element_blank(), legend.position = "none")+
scale_fill_manual( values = c("#C55A11","#E67F87"))+theme_pubr()
p<-data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
rownames_to_column("gene") %>%
mutate(gene_type=gene %in% tfs) %>%
ggplot(aes(x=weightedLASSO, y=weightedRF)) +
geom_point(alpha = 0)+
geom_bin2d(bins=11)+
scale_fill_gradient(name = "count", trans = "log",
low = "white", high = "#4670CD",
breaks = c(2,10,50,250,700),
labels = c(2,10,50,250,700))+
theme_pubclean() + theme(legend.position = "right",
axis.text = element_text(size = 15),
axis.title = element_text(size = 20))
ggExtra::ggMarginal(p, type = "histogram", fill = "#C55A11", size = 1)
For the lasso :
load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")
alphas_lasso_min <- alphas_lasso
alphas_rf_min <- alphas_rf
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
alphas_lasso_div <- alphas_lasso
alphas_rf_div <- alphas_rf
saved_lasso <- setdiff(names(alphas_lasso_div[alphas_lasso_div > 0]),
names(alphas_lasso_min[alphas_lasso_min > 0]))
saved_rf <- setdiff(names(alphas_rf_div[alphas_rf_div > 0]),
names(alphas_rf_min[alphas_rf_min > 0]))
same_rf <- intersect(names(alphas_rf_div[alphas_rf_div > 0]),
names(alphas_rf_min[alphas_rf_min > 0]))
lost_lasso <- setdiff(names(alphas_lasso_min[alphas_lasso_min > 0]),
names(alphas_lasso_div[alphas_lasso_div > 0]))
lost_rf <- setdiff(names(alphas_rf_min[alphas_rf_min > 0]),
names(alphas_rf_div[alphas_rf_div > 0]))
same_lasso <- intersect(names(alphas_lasso_min[alphas_lasso_min > 0]),
names(alphas_lasso_div[alphas_lasso_div > 0]))
matrix(c(length(same_lasso), length(lost_lasso), length(saved_lasso),
length(genes) - length(same_lasso) - length(lost_lasso) - length(saved_lasso)),
nrow = 2, byrow = F, dimnames = list(c("data integration div", "no data integration div"),
c("data integration min", "no data integration min")))
## data integration min no data integration min
## data integration div 445 24
## no data integration div 585 372
For the RFs :
matrix(c(length(same_rf), length(lost_rf), length(saved_rf),
length(genes) - length(same_rf) - length(lost_rf) - length(saved_rf)),
nrow = 2, byrow = F, dimnames = list(c("data integration div", "no data integration div"),
c("data integration min", "no data integration min")))
## data integration min no data integration min
## data integration div 351 3
## no data integration div 388 684
–>
–>
–>
Depending on their class (optimal alpha different from 0 or not).
load("results/brf_perumtations_mse_all_genes_predict.rdata")
load(file = "results/alpha_per_gene_weighted_RF.rdata")
pos_class <- names(alphas_rf[alphas_rf!=0])
mse <- lmses[str_detect(colnames(lmses), "trueData")]
for(alpha in seq(0,1, by = 0.1)){
mse[,paste("alpha",alpha)] <- rowMeans(mse[,str_detect(colnames(mse), paste0("RF_",as.character(alpha), "_"))])
}
mse <- as.matrix(mse[str_detect(colnames(mse), "alpha")])
mse_interest <- mse[pos_class,]
mse_other <- mse[setdiff(rownames(mse), pos_class),]
library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), hcl_palette = "Blue-Red 3")
ha = HeatmapAnnotation(
alpha = anno_simple(as.numeric(str_remove(colnames(mse), "alpha "))),
annotation_name_side = "left")
# png("results/rf_mse_interest.png", res = 300, height = 1200, width = 2000)
Heatmap((mse_interest-rowMeans(mse_interest))/matrixStats::rowSds(mse_interest),
col = col_fun, show_column_names = F,
width = ncol(mse_interest)*unit(10, "mm"),
height = nrow(mse_interest)*unit(0.2, "mm"),
cluster_columns = F, show_row_names = F, top_annotation = ha)+
rowAnnotation(class = ifelse(alphas_rf[rownames(mse_interest)]>0,
"class of interest", "no integration"),
col = list(class = setNames(c("#70AD47", "grey"),
nm = c("class of interest", "no integration"))))
# dev.off()
#pdf("results/rf_mse_others.pdf", height = 10)
# png("results/rf_mse_others.png", res = 300, height = 3000, width = 2000)
Heatmap((mse_other-rowMeans(mse_other))/matrixStats::rowSds(mse_other),
col = col_fun,show_column_names = F,
width = ncol(mse_other)*unit(10, "mm"),
height = nrow(mse_other)*unit(0.2, "mm"),
cluster_columns = F, show_row_names = F, top_annotation = ha)+
rowAnnotation(class = ifelse(alphas_rf[rownames(mse_other)]>0,
"class of interest", "no integration"),
col = list(class = setNames(c("#70AD47", "grey"),
nm = c("class of interest", "no integration"))))
# dev.off()
load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
pos_class <- names(alphas_lasso[alphas_lasso!=0])
mse <- lmses[str_detect(colnames(lmses), "trueData")]
for(alpha in seq(0,1, by = 0.1)){
mse[,paste("alpha",alpha)] <- rowMeans(mse[,str_detect(colnames(mse), paste0("LASSO_",as.character(alpha), "_"))])
}
mse <- as.matrix(mse[str_detect(colnames(mse), "alpha")])
mse_interest <- mse[pos_class,]
mse_other <- mse[setdiff(rownames(mse), pos_class),]
library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), hcl_palette = "Blue-Red 3")
ha = HeatmapAnnotation(
alpha = anno_simple(as.numeric(str_remove(colnames(mse), "alpha "))),
annotation_name_side = "left")
#pdf("results/lasso_mse_interest.pdf", height = 7)
# png("results/lasso_mse_interest.png", res = 300, height = 1400, width = 2000)
Heatmap((mse_interest-rowMeans(mse_interest))/matrixStats::rowSds(mse_interest),
col = col_fun, show_column_names = F,
width = ncol(mse_interest)*unit(10, "mm"),
height = nrow(mse_interest)*unit(0.2, "mm"),
cluster_columns = F, show_row_names = F, top_annotation = ha)+
rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_interest)]>0,
"class of interest", "no integration"),
col = list(class = setNames(c("#70AD47", "grey"),
nm = c("class of interest", "no integration"))))
# dev.off()
#pdf("results/lasso_mse_others.pdf", height = 7)
# png("results/lasso_mse_others.png", res = 300, height = 2800, width = 2000)
Heatmap((mse_other-rowMeans(mse_other))/matrixStats::rowSds(mse_other),
col = col_fun,show_column_names = F,
width = ncol(mse_other)*unit(10, "mm"),
height = nrow(mse_other)*unit(0.2, "mm"),
cluster_columns = F, show_row_names = F, top_annotation = ha)+
rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_other)]>0,
"class of interest", "no integration"),
col = list(class = setNames(c("#70AD47", "grey"),
nm = c("class of interest", "no integration"))))
# dev.off()
Divergence from perm :
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
venn <- ggVennDiagram(list("weightedRF" = names(alphas_rf[alphas_rf!=0]),
"weightedLASSO" = names(alphas_lasso[alphas_lasso!=0])), edge_size = 2)+
scale_color_manual( values = c("#C55A11","#E67F87"))+
scale_fill_gradient(high="#e6f2ff", low="#e6f2ff")+
theme(legend.position = "none");venn
# hypergeometric test to assess the significance of the intersect
p_enrich <- phyper(q=220, m = length(names(alphas_rf[alphas_rf!=0])),
n = length(genes) - length(names(alphas_rf[alphas_rf!=0])),
k = length( names(alphas_lasso[alphas_lasso!=0])), lower.tail = F)
p_enrich
## [1] 1.442015e-40
ggexport(venn, filename = "results/classes_intersection.pdf", width = 4, height = 4)
Min MSE :
load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")
venn <- ggVennDiagram(list("weightedRF" = names(alphas_rf[alphas_rf!=0]),
"weightedLASSO" = names(alphas_lasso[alphas_lasso!=0])), edge_size = 2)+
scale_color_manual( values = c("#C55A11","#E67F87"))+
scale_fill_gradient(high="#e6f2ff", low="#e6f2ff")+
theme(legend.position = "none");venn
# hypergeometric test to assess the significance of the intersect
p_enrich <- phyper(q=220, m = length(names(alphas_rf[alphas_rf!=0])),
n = length(genes) - length(names(alphas_rf[alphas_rf!=0])),
k = length( names(alphas_lasso[alphas_lasso!=0])), lower.tail = F)
p_enrich
## [1] 1
ggexport(venn, filename = "results/classes_intersection_min.pdf", width = 4, height = 4)
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
load("rdata/pwm_prom_jaspar_dap.rdata")
pos_class_rf <- names(alphas_rf[alphas_rf!=0])
pos_class_lasso <- names(alphas_lasso[alphas_lasso!=0])
common <- intersect(pos_class_lasso, pos_class_rf)
known_tfs <- tfs[which(tfs %in% pwm_prom$TF)]
get_number_of_motifs_per_tfs <- function(genes){
table(pwm_prom[pwm_prom$target %in% genes & pwm_prom$TF %in% tfs,"TF"])[known_tfs]
}
gene_list <- pos_class_rf
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)
for(tf in known_tfs){
# number of genes with that motif in all genes
n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
# number of genes with that motif in the lasso group
n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
m=n_targets_in_all_tf, #white balls
n=length(genes)-n_targets_in_all_tf, # black balls
k=n_group_lasso, lower.tail = FALSE)
enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")
gene_list <- pos_class_lasso
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)
for(tf in known_tfs){
# number of genes with that motif in all genes
n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
# number of genes with that motif in the lasso group
n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
m=n_targets_in_all_tf, #white balls
n=length(genes)-n_targets_in_all_tf, # black balls
k=n_group_lasso, lower.tail = FALSE)
enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")
gene_list <- common
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)
for(tf in known_tfs){
# number of genes with that motif in all genes
n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
# number of genes with that motif in the lasso group
n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
m=n_targets_in_all_tf, #white balls
n=length(genes)-n_targets_in_all_tf, # black balls
k=n_group_lasso, lower.tail = FALSE)
enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")
draw_validation <- function(validation, densities = c(0.005,0.01,0.05),
precision_only = F, recall_only = F){
data_val <- validation %>%
filter(density %in% densities)%>%
group_by(model, alpha, dataset, density) %>%
mutate(mean_precision = mean(precision, na.rm = T),
sd_precision = sd(precision, na.rm = T),
mean_recall = mean(recall, na.rm = T),
sd_recall = sd(recall, na.rm = T),
density = paste("D =", density),
model = str_replace(model, "bRF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO")) %>%
dplyr::select(-network_name) %>%
mutate(alpha = as.numeric(alpha))
precision <- data_val %>%
ggplot(aes(
x = as.numeric(alpha),
y = precision,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(density), ncol = 8, nest_line = T) +
geom_ribbon(aes(ymin = mean_precision - sd_precision ,
ymax = mean_precision + sd_precision ),
alpha = .4) + theme_pubr(legend = "top")+
geom_point(alpha = 0.1) +
xlab(expression(alpha)) + ylab("Precision") +
ggtitle(paste("Precision against DAP-Seq")) +
geom_line(aes(group = dataset, y = mean_precision)) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
)+scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T)
recall <- data_val %>%
ggplot(aes(
x = as.numeric(alpha),
y = recall,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(density), ncol = 8, nest_line = T, scales = "free") +
geom_ribbon(aes(ymin = mean_recall - sd_recall ,
ymax = mean_recall + sd_recall ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + xlab("alpha") +
xlab(expression(alpha)) + ylab("Recall") +
geom_line(aes(group = dataset, y = mean_recall)) +
ggtitle(paste("Recall against DAPSeq")) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
)+scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
if(precision_only)
return(precision)
if(recall_only)
return(recall)
if(!precision_only & !precision_only)
return(precision/recall)
}
load(file = "results/100_permutations_bRF_validation_dap_inf.rdata")
val_rf <- val_dap
load(file = "results/100_permutations_lasso_validation_dap.rdata")
val_lasso <- val_dap
draw_validation(validation = val_lasso)
draw_validation(validation = val_rf)
# load("results/alpha_per_gene_weighted_LASSO_test.rdata")
# load("results/alpha_per_gene_weighted_RF.rdata")
load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")
hist(alphas_lasso)
hist(alphas_rf)
nCores = 40
mats <- list()
nrep <- 10
for(rep in 1:nrep){ # exploring inherent variability
mat_lasso <- weightedLASSO_inference(counts, genes, tfs,
alpha = alphas_lasso, N = 50,
tf_expression_permutation = FALSE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
alpha = alphas_lasso, N = 50,
tf_expression_permutation = TRUE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mats[[paste0("LASSO_trueData_", rep)]] <- mat_lasso
mats[[paste0("LASSO_shuffled_", rep)]] <- mat_lasso_perm
mat_rf <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alphas_rf,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mat_rf_perm <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alphas_rf,
tf_expression_permutation = TRUE,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mats[[paste0("RF_trueData_", rep)]] <- mat_rf
mats[[paste0("RF_shuffled_", rep)]] <- mat_rf_perm
}
save(mats, file = "results/gene_specific_grns_min.rdata")
edges <- list()
lmses <- data.frame(row.names = genes)
for(name in names(mats)){
for(density in c(0.005,0.01,0.05)){# exploring importance threshold stringency
if(str_detect(name, "LASSO"))
edges[[paste0(name, '_', density)]] <-
weightedLASSO_network(mats[[name]], density = density, pwm_occurrence,
genes, tfs, decreasing = TRUE)
else
edges[[paste0(name, '_', density)]] <-
weightedRF_network(mats[[name]], density = density, pwm_occurrence,
genes, tfs)
lmses[,name] <- mats[[name]]["mse",]
}
}
save(lmses, file = "results/gene_specific_mse_min.rdata")
save(edges, file = "results/gene_specific_edges_min.rdata")
# validation against DAP-Seq
settings <- c("model", "dataset", "rep", "density")
val_specific <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 30,
input_genes = genes,
input_tfs = tfs
)
val_specific[, settings] <-
str_split_fixed(val_specific$network_name, '_', length(settings))
save(val_specific, file = "results/gene_specific_validation_min.rdata")
load("results/gene_specific_grns.rdata")
load("results/gene_specific_mse.rdata")
load("results/gene_specific_edges.rdata")
settings <- c("model", "dataset", "rep", "density")
load("results/gene_specific_validation.rdata")
precision_rf <- draw_validation(validation = val_rf,
densities = c(0.005), precision_only = T)
precision_lasso <- draw_validation(validation = val_lasso,
densities = c(0.005), precision_only = T)
recall_rf <- draw_validation(validation = val_rf,
densities = c(0.005), recall_only = T)
recall_lasso <- draw_validation(validation = val_lasso,
densities = c(0.005), recall_only = T)
spec_rf <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
) + xlab("") + ylim(c(0.2,0.46))+ ylab("")
spec_lasso <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
spec_rf_recall <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = recall, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
) + xlab("") + ylab("")
spec_lasso_recall <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = recall, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylab("")
# gene specific MSE
mse_lasso_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
separate(variable,
into = c("model", "dataset", "rep"),
sep = "_") %>%
filter(model == "LASSO") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylab("MSE")
mse_rf_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
separate(variable,
into = c("model", "dataset", "rep"),
sep = "_") %>%
filter(model == "RF") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylab("MSE")
# MSE for global alphas
draw_mse <- function(model){
data <- lmses[as.numeric(str_split_fixed(colnames(lmses), '_', 4)[,4]) <=10] %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
separate(variable,
into = c("model", "alpha", "dataset", "rep"),
sep = "_") %>%
filter(model == model) %>%
group_by(dataset, rep, alpha) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
group_by(alpha, dataset) %>%
mutate(mean_median_mse = mean(median_MSE),
sd_median_mse = sd(median_MSE)) %>%
mutate(alpha = as.numeric(alpha))
data %>%
ggplot(aes(x=alpha, y = median_MSE, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"),
c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"),
c("shuffled", "trueData"))) +
geom_line(aes(y=mean_median_mse, group = dataset)) +
geom_ribbon(aes(ymin = mean_median_mse - sd_median_mse ,
ymax = mean_median_mse + sd_median_mse),
alpha = .4)+ xlab(expression(alpha)) +
theme_pubr() + geom_point(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
legend.position = 'none'
)+ ylab("MSE")
}
load("results/lasso_perumtations_mse_all_genes_test.rdata")
mse_lasso <- draw_mse("LASSO")
load("results/brf_perumtations_mse_all_genes_predict.rdata")
mse_rf <- draw_mse("RF")
x_min <- 0.19
plot <- mse_lasso + ylim(c(x_min,0.265)) +labs(subtitle = "Global\ndata integration")+
mse_lasso_spec+ ylim(c(x_min,0.265)) +labs(subtitle = "Gene-specific\ndata integration")+ylab("") +
mse_rf + ylim(c(x_min,0.265)) +labs(subtitle = "Global\ndata integration")+ ylab("") +
mse_rf_spec+ ylim(c(x_min,0.265)) +labs(subtitle = "Gene-specific\ndata integration")+ylab("") +
precision_lasso + ylim(c(0.2,0.46)) +ggtitle("")+ labs(subtitle = "Global\ndata integration")+
theme(legend.position = 'none') +
spec_lasso + labs(subtitle = "Gene-specific\ndata integration")+
precision_rf +labs(subtitle = "Global\ndata integration")+ ylab("") +ylim(c(0.2,0.46))+
theme(legend.position = 'none')+ggtitle("") +
spec_rf +labs(subtitle = "Gene-specific\ndata integration")+
recall_lasso + ylim(c(0,0.03)) +ggtitle("")+ labs(subtitle = "Global\ndata integration")+ theme(legend.position = 'none')+
theme(legend.position = 'none') +
spec_lasso_recall + labs(subtitle = "Gene-specific\ndata integration")+ ylim(c(0,0.03))+
recall_rf +labs(subtitle = "Global\ndata integration")+ ylab("")+ ylim(c(0,0.03))+ theme(legend.position = 'none')+
theme(legend.position = 'none')+ggtitle("") +
spec_rf_recall +labs(subtitle = "Gene-specific\ndata integration")+ ylim(c(0,0.03))+
plot_layout(guides = "collect", ncol = 4, widths = c(1.5,1,1.5,1))& theme(legend.position = 'bottom');plot
ggexport(plot, filename = "results/specific_grns.pdf", width = 10, height = 10)
load("results/gene_specific_grns_min.rdata")
load("results/gene_specific_mse_min.rdata")
load("results/gene_specific_edges_min.rdata")
# settings <- c("model", "dataset", "rep", "density")
# val_specific <-
# evaluate_networks(
# edges,
# validation = c("DAPSeq"),
# nCores = 30,
# input_genes = genes,
# input_tfs = tfs
# )
# val_specific[, settings] <-
# str_split_fixed(val_specific$network_name, '_', length(settings))
load("results/gene_specific_validation_min.rdata")
# save(val_specific, file = "results/gene_specific_validation_min.rdata")
spec_rf <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
) + xlab("") + ylim(c(0.2,0.46))+ ylab("")
spec_lasso <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
mse_lasso_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
separate(variable,
into = c("model", "dataset", "rep"),
sep = "_") %>%
filter(model == "LASSO") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylab("")
mse_rf_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
separate(variable,
into = c("model", "dataset", "rep"),
sep = "_") %>%
filter(model == "RF") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylab("")
mse_lasso + ylim(c(0.17,0.265)) +labs(subtitle = "LASSO")+ ggtitle("MSE") +
mse_lasso_spec+ ylim(c(0.17,0.265)) +labs(subtitle = "LASSO")+
mse_rf + ylim(c(0.17,0.265)) +labs(subtitle = "RF") +
mse_rf_spec+ ylim(c(0.17,0.265)) +labs(subtitle = "RF")+
precision_lasso + ylim(c(0.2,0.46)) + ggtitle("Precision") + labs(subtitle = "LASSO")+
theme(legend.position = 'none') +
spec_lasso + labs(subtitle = "LASSO")+
precision_rf + labs(subtitle = "RF") +ylim(c(0.2,0.46))+theme(legend.position = 'none')+ggtitle("") +
spec_rf +labs(subtitle = "RF") +
plot_layout(guides = "collect", ncol = 4, widths = c(1,1,1,1))
Showing the precision of grns with min method and D=0.005
load("results/gene_specific_grns_min.rdata")
edges <- list()
# lmses <- data.frame(row.names = genes)
for(name in names(mats)){
for(density in c(0.005)){# exploring importance threshold stringency
if(str_detect(name, "LASSO"))
edges[[paste0(name, '_', density)]] <-
weightedLASSO_network(mats[[name]][,lost_lasso], density = density, pwm_occurrence,
lost_lasso, tfs, decreasing = TRUE)
else
edges[[paste0(name, '_', density)]] <-
weightedRF_network(mats[[name]][,lost_rf], density = density, pwm_occurrence,
lost_rf, tfs)
# lmses[,name] <- mats[[name]]["mse",]
}
}
settings <- c("model", "dataset", "rep", "density")
val_specific_lost <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 30,
input_genes = genes,
input_tfs = tfs
)
## 5.524 sec elapsed
val_specific_lost[, settings] <-
str_split_fixed(val_specific_lost$network_name, '_', length(settings))
load("results/gene_specific_validation_min.rdata")
spec_rf <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
) + xlab("") + ylim(c(0.2,0.46))+ ylab("")
spec_lasso <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
spec_rf_lost <- val_specific_lost %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
) + xlab("") + ylim(c(0.2,0.46))+ ylab("")
spec_lasso_lost <- val_specific_lost %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
spec_lasso_lost + ggtitle("genes lost lasso") + spec_lasso + ggtitle("all genes lasso") +
spec_rf_lost + ggtitle("genes lost rf") +spec_rf+ ggtitle("all genes rf") + plot_layout(ncol = 4)
# save(val_specific, file = "results/gene_specific_validation_min.rdata")
# save(lmses, file = "results/gene_specific_mse_min.rdata")
# save(edges, file = "results/gene_specific_edges_min.rdata")
Showing the precision of grns with max div method and D=0.005
load("results/gene_specific_grns.rdata")
edges <- list()
# lmses <- data.frame(row.names = genes)
for(name in names(mats)){
for(density in c(0.005)){# exploring importance threshold stringency
if(str_detect(name, "LASSO"))
edges[[paste0(name, '_', density)]] <-
weightedLASSO_network(mats[[name]][,saved_lasso], density = density, pwm_occurrence,
saved_lasso, tfs, decreasing = TRUE)
else
edges[[paste0(name, '_', density)]] <-
weightedRF_network(mats[[name]][,saved_rf], density = density, pwm_occurrence,
saved_rf, tfs)
# lmses[,name] <- mats[[name]]["mse",]
}
}
settings <- c("model", "dataset", "rep", "density")
val_specific_gain <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 30,
input_genes = genes,
input_tfs = tfs
)
val_specific_gain[, settings] <-
str_split_fixed(val_specific_gain$network_name, '_', length(settings))
load("results/gene_specific_validation.rdata")
spec_rf <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0,1))+ ylab("")
spec_lasso <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0,1))+ ylab("")
spec_rf_gain <- val_specific_gain %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0,1))+ ylab("")
spec_lasso_gain <- val_specific_gain %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0,1))+ ylab("")
spec_lasso_gain + ggtitle("saved lasso") + spec_lasso + ggtitle("all genes lasso") +
spec_rf_gain+ ggtitle("saved rf") +spec_rf+ ggtitle("all genes rf") + plot_layout(ncol = 4)
# save(val_specific, file = "results/gene_specific_validation_min.rdata")
# save(lmses, file = "results/gene_specific_mse_min.rdata")
# save(edges, file = "results/gene_specific_edges_min.rdata")
load(file = "results/100_permutations_lasso_edges.rdata")
# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_targ <-
evaluate_networks(
edges,
validation = c("TARGET"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_targ[, settings] <-
str_split_fixed(val_targ$network_name, '_', length(settings))
save(val_targ, file = "results/100_permutations_lasso_validation_target.rdata")
load(file = "results/100_permutations_lasso_validation_target.rdata")
lasso_targ <- draw_validation(validation = val_targ, precision_only = T, densities = 0.005)
load(file = "results/100_permutations_rf_validation_target_inf.rdata")
rf_targ <- draw_validation(validation = val_targ, precision_only = T, densities = 0.005)
load("results/gene_specific_edges.rdata")
settings <- c("model", "dataset", "rep", "density")
val_specific <-
evaluate_networks(
edges,
validation = c("TARGET"),
nCores = 30,
input_genes = genes,
input_tfs = tfs
)
## 13.828 sec elapsed
val_specific[, settings] <-
str_split_fixed(val_specific$network_name, '_', length(settings))
save(val_specific, file = "results/gene_specific_validation_target.rdata")
spec_rf <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "RF") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
) + xlab("") + ylim(c(0.2,0.46))+ ylab("")
spec_lasso <- val_specific %>%
separate(network_name, into = settings, sep = "_") %>%
filter(density == 0.005 & model == "LASSO") %>%
mutate(density = paste("D =", density),
model = str_replace(model, "RF", "weightedRF"),
model = str_replace(model, "LASSO", "weightedLASSO"))%>%
ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) +
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text.x = element_blank(),
legend.position = 'none'
)+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
lasso_targ+ylim(0.1, 0.26) +spec_lasso+ylim(0.1, 0.26)+
rf_targ +ylim(0.1, 0.26) + spec_rf +ylim(0.1, 0.26) +plot_layout(ncol = 4)+plot_annotation("TARGET validation")
Validation of TGA1 targets in inferred GRNs
Differentially expressed targets of TGA1 (35S:TGA1 vs Col-0) identified in whole roots obtained from (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6451032/#MOESM7) (Varala)
load("results/gene_specific_grns.rdata")
load("results/gene_specific_mse.rdata")
load("results/gene_specific_edges.rdata")
tga1_oe <- read.table("data/tga1_overexpression.tsv", h=T, sep = '\t') %>%
mutate(ifelse(is.na(Log2FC), 0, Log2FC)) %>%
mutate(ifelse(is.na(padj), 1, padj)) %>%
mutate(ALog2FC = abs(Log2FC), pval = -log10(padj)) %>%
rename(genes = Target)
tga1 <- "AT5G65210"
draw_oeScore_in_targets <- function(model, density){
nets <- edges[paste0(model, '_trueData_',1:10, '_', density)]
df <- data.frame(tga1_target = logical(0), median_oe = double(0))
for(net in nets){
targs <- net %>% filter(from == tga1)
targs <- targs$to
df <- rbind.data.frame(df, data.frame(genes = unique(net$to),
tga1_target = unique(net$to) %in% targs)%>%
left_join(tga1_oe, by = c("genes"))%>%
group_by(tga1_target) %>%
summarize(median_oe = median(pval, na.rm=T)))
}
df %>%
ggplot(aes(x=tga1_target, y=median_oe, fill = tga1_target))+
geom_boxplot(width = 0.25, outlier.alpha = 0) +
stat_compare_means() + geom_jitter(width = 0.05)+
theme_pubr()
}
draw_oeScore_in_targets("RF", 0.005)+
draw_oeScore_in_targets("LASSO", 0.005)
draw_oeScore_in_targets("RF", 0.01)+
draw_oeScore_in_targets("LASSO", 0.01)
draw_oeScore_in_targets("RF", 0.05)+
draw_oeScore_in_targets("LASSO", 0.05)
draw_importance_vs_oescore <- function(model){
mat_ <- mats[paste0(model, '_trueData_',1:10)]
df <- data.frame(importance = logical(0), ALog2FC = double(0))
for(mat in mat_){
mat <- mat_[[1]]
targs <- net %>% filter(from == tga1)
targs <- targs$to
df <- rbind.data.frame(df,
data.frame(genes = genes,
importance = mat[tga1,genes])%>%
left_join(tga1_oe, by = c("genes")) %>%
select(genes, importance, pval))
}
df %>%
group_by(genes) %>%
summarise(mean_importance = mean(importance, na.rm=T),
pval=mean(pval))%>%
ggplot(aes(x=mean_importance, y=pval))+
geom_point() +
stat_cor() +
theme_pubr() + scale_y_log10() + scale_x_log10()
}
draw_importance_vs_oescore("RF")+
draw_importance_vs_oescore("LASSO")
draw_ALFC_in_targets("RF", 0.01)+
draw_ALFC_in_targets("LASSO", 0.01)
Instead of overexpression in order to remove indirect targets
tga1_tg <- read.table("data/tga1_target.txt")$V1
tga4_tg <- read.table("data/tga4_target.txt")$V1
vrn1_tg <- read.table("data/vrn1_target.txt")$V1
hho3_tg <- read.table("data/hho3_target.txt")$V1
hho5_tg <- read.table("data/hho5_target.txt")$V1
div1_tg <- read.table("data/div1_target.txt")$V1
tga1 <- "AT5G65210"
tga4 <- "AT5G10030"
vrn1 <- "AT3G18990"
hho3 <- "AT1G25550"
hho5 <- "AT4G37180"
div1 <- "AT5G58900"
load("results/gene_specific_edges.rdata")
get_precision <- function(model, density){
df <- data.frame(precision_tga1 = double(0),
precision_tga4 = double(0),
precision_vrn1 = double(0),
precision_hho5 = double(0),
precision_hh3 = double(0),
precision_div1 = double(0),
model = character(0),
density = double(0), dataset = character(0),
alpha = character(0))
for(dataset in c("shuffled", "trueData")){
nets <- edges[paste0(model, '_', dataset ,'_',1:10, '_', density)]
for(net in nets){
df <- rbind.data.frame(df, data.frame(
precision_tga1 =length(intersect( net[net$from == tga1,]$to, tga1_tg))/length( net[net$from == tga1,]$to),
precision_tga4 =length(intersect( net[net$from == tga4,]$to, tga4_tg))/length( net[net$from == tga4,]$to),
precision_vrn1 =length(intersect( net[net$from == vrn1,]$to, vrn1_tg))/length( net[net$from == vrn1,]$to),
precision_hho5 =length(intersect( net[net$from == hho5,]$to, hho5_tg))/length( net[net$from == hho5,]$to),
precision_hho3 =length(intersect( net[net$from == hho3,]$to, hho3_tg))/length( net[net$from == hho3,]$to),
precision_div1 =length(intersect( net[net$from == div1,]$to, div1_tg))/length( net[net$from == div1,]$to),
model = paste0("weighted",model) , density = density,
dataset = dataset, alpha = "gene-specific"))
}
}
return(df)
}
precision_tga1_specific <- rbind.data.frame(get_precision("RF", 0.005),get_precision("LASSO", 0.005))
load(file = "results/100_permutations_bRF_edges_inf.rdata")
get_precision_global_RF <- function(densities, alphas, m){
df <- data.frame(precision_tga1 = double(0),
precision_tga4 = double(0),
precision_vrn1 = double(0),
precision_hho5 = double(0),
precision_hh3 = double(0),
precision_div1 = double(0),
model = character(0),
density = double(0), dataset = character(0),
alpha = double(0))
for(dataset in c("shuffled", "trueData")){
for(alpha in alphas){
for(density in densities){
nets <- edges[paste0('bRF_', alpha, '_', dataset ,'_',1:20, '_', density)]
for(net in nets){
df <- rbind.data.frame(df, data.frame(
precision_tga1 =length(intersect( net[net$from == tga1,]$to, tga1_tg))/length( net[net$from == tga1,]$to),
precision_tga4 =length(intersect( net[net$from == tga4,]$to, tga4_tg))/length( net[net$from == tga4,]$to),
precision_vrn1 =length(intersect( net[net$from == vrn1,]$to, vrn1_tg))/length( net[net$from == vrn1,]$to),
precision_hho5 =length(intersect( net[net$from == hho5,]$to, hho5_tg))/length( net[net$from == hho5,]$to),
precision_hho3 =length(intersect( net[net$from == hho3,]$to, hho3_tg))/length( net[net$from == hho3,]$to),
precision_div1 =length(intersect( net[net$from == div1,]$to, div1_tg))/length( net[net$from == div1,]$to),
model = "weightedRF", density = density,
dataset = dataset, alpha = alpha))
}
}
}
}
return(df)
}
precision_tga1_global_rf <- get_precision_global_RF(0.005, c(0,1))
tga1 <-
load(file = "results/100_permutations_lasso_edges.rdata")
get_precision_global_LASSO <- function(densities, alphas, m){
df <- data.frame(precision_tga1 = double(0),
precision_tga4 = double(0),
precision_vrn1 = double(0),
precision_hho5 = double(0),
precision_hh3 = double(0),
precision_div1 = double(0),
model = character(0),
density = double(0), dataset = character(0),
alpha = double(0))
for(dataset in c("shuffled", "trueData")){
for(alpha in alphas){
for(density in densities){
nets <- edges[paste0('LASSO_', alpha, '_', dataset ,'_',1:20, '_', density)]
for(net in nets){
df <- rbind.data.frame(df, data.frame(
precision_tga1 =length(intersect( net[net$from == tga1,]$to, tga1_tg))/length( net[net$from == tga1,]$to),
precision_tga4 =length(intersect( net[net$from == tga4,]$to, tga4_tg))/length( net[net$from == tga4,]$to),
precision_vrn1 =length(intersect( net[net$from == vrn1,]$to, vrn1_tg))/length( net[net$from == vrn1,]$to),
precision_hho5 =length(intersect( net[net$from == hho5,]$to, hho5_tg))/length( net[net$from == hho5,]$to),
precision_hho3 =length(intersect( net[net$from == hho3,]$to, hho3_tg))/length( net[net$from == hho3,]$to),
precision_div1 =length(intersect( net[net$from == div1,]$to, div1_tg))/length( net[net$from == div1,]$to),
model = "weightedLASSO", density = density,
dataset = dataset, alpha = alpha))
}
}
}
}
return(df)
}
precision_tga1_global_lasso <- get_precision_global_LASSO(0.005, c(0,1))
target_tfs <- rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density)) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_tga1") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("TGA1") +
xlab(expression(alpha)) + theme_pubclean() +
scale_fill_brewer(palette = "Paired")+
theme(strip.background = element_blank(),
legend.position = "none")+
rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density) ) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_tga4") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("TGA4") +
scale_fill_brewer(palette = "Paired")+
xlab(expression(alpha)) + theme_pubclean() +
theme(strip.background = element_blank())+
rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density)) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_vrn1") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("VRN1") +
xlab(expression(alpha)) + theme_pubclean() +
scale_fill_brewer(palette = "Paired")+
theme(strip.background = element_blank(),
legend.position = "none")+
rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density)) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_hho5") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("hho5") +
xlab(expression(alpha)) + theme_pubclean() +
scale_fill_brewer(palette = "Paired")+
theme(strip.background = element_blank(),
legend.position = "none")+
rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density)) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_hho3") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("HHO3") +
scale_fill_brewer(palette = "Paired")+
xlab(expression(alpha)) + theme_pubclean() +
theme(strip.background = element_blank())+
rbind.data.frame(precision_tga1_global_lasso, precision_tga1_global_rf) %>%
rbind.data.frame(precision_tga1_specific) %>%
mutate(alpha = as.factor(alpha),
density = as.factor(density)) %>%
filter(dataset=="trueData") %>%
reshape2::melt() %>%
filter( variable == "precision_div1") %>%
ggplot(aes(x=alpha, y = value)) +
ggh4x::facet_nested_wrap(vars(model), ncol = 2, nest_line = T) +
geom_boxplot(aes(fill = alpha))+
geom_point()+ggtitle("DIV1") +
xlab(expression(alpha)) + theme_pubclean() +
scale_fill_brewer(palette = "Paired")+
theme(strip.background = element_blank(),
legend.position = "none");target_tfs
Shows the top 25 regulators in terms of out-degree.
load("results/gene_specific_edges.rdata")
annot <- DIANE::get_gene_information(genes,organism = "Arabidopsis thaliana")
get_hubs <- function(nets){
g <- setNames(rep(0, length(genes)), genes)
for(net in names(nets)){
g[names(table(nets[[net]]$from))] <- g[names(table(nets[[net]]$from))] + table(nets[[net]]$from)/length(nets)
}
df <- data.frame(genes = genes, average_degree = g[genes],
label = annot[match(genes, rownames(annot)), "label"])
plt <- df %>%
slice_max(average_degree, n = 25) %>%
ggplot(aes(x=average_degree, y=reorder(label, average_degree) ))+
geom_point() + theme_bw() + ylab("gene") + xlab("Average out-degree")
top20<- df %>%
slice_max(average_degree, n = 25) %>%
select(label)
return(list(ranking = plt, top25 = top20$label))
}
rf_nets <- names(edges)[str_detect(names(edges),"RF_trueData") ]
rf_nets <- edges[rf_nets[str_detect(rf_nets,"0.005") ]]
lasso_nets <- names(edges)[str_detect(names(edges),"LASSO_trueData") ]
lasso_nets <- edges[lasso_nets[str_detect(lasso_nets,"0.005") ]]
get_hubs(rf_nets)$ranking+ ggtitle("weightedRF") + get_hubs(lasso_nets)$ranking+ggtitle("weightedLASSO")
intersect(get_hubs(rf_nets)$top25 , get_hubs(lasso_nets)$top25)
## [1] "ASR3" "HHO2" "TGA1" "TGA4" "ABO3" "HHO3"
## [7] "DIV1" "AT3G11280"
Rankings in global alpha networks compared for alpha specific in RF:
load(file = "results/100_permutations_bRF_edges_inf.rdata")
rf_nets_0 <- names(edges)[str_detect(names(edges),"RF_0_trueData") ]
rf_nets_0 <- edges[rf_nets_0[str_detect(rf_nets_0,"0.005") ]]
rf_nets_1 <- names(edges)[str_detect(names(edges),"RF_1_trueData") ]
rf_nets_1 <- edges[rf_nets_1[str_detect(rf_nets_1,"0.005") ]]
get_hubs(rf_nets_0)$ranking+ ggtitle("weightedRF alpha = 0") +
get_hubs(rf_nets_1)$ranking+ggtitle("weightedRF alpha = 1") +
get_hubs(rf_nets)$ranking+ ggtitle("weightedRF specific alpha")
intersect(get_hubs(rf_nets)$top25 , get_hubs(rf_nets_1)$top25)
## [1] "ASR3" "HHO2" "WRKY18" "TGA1" "AT1G12630" "TGA4"
## [7] "BHLH34" "ABO3" "BZIP3" "HHO3" "DIV1" "AT3G11280"
## [13] "HRS1"
intersect(get_hubs(rf_nets)$top25 , get_hubs(rf_nets_0)$top25)
## [1] "ASR3" "HHO2" "WRKY18" "TGA1" "AT1G12630" "TGA4"
## [7] "BHLH34" "ABO3" "CDF1" "BT2" "LBD37" "LBD39"
## [13] "NLP4" "CRF11" "BZIP3" "COL5" "RR3" "LBD38"
## [19] "TLP9"
# regulators that were not in the top 25
setdiff(get_hubs(rf_nets)$top25 , union(get_hubs(rf_nets_0)$top25,
get_hubs(rf_nets_1)$top25))
## [1] "BZIP53" "HHO6"
Rankings in global alpha networks compared for alpha specific in LASSO:
load(file = "results/100_permutations_lasso_edges.rdata")
lasso_nets_0 <- names(edges)[str_detect(names(edges),"LASSO_0_trueData") ]
lasso_nets_0 <- edges[lasso_nets_0[str_detect(lasso_nets_0,"0.005") ]]
lasso_nets_1 <- names(edges)[str_detect(names(edges),"LASSO_1_trueData") ]
lasso_nets_1 <- edges[lasso_nets_1[str_detect(lasso_nets_1,"0.005") ]]
get_hubs(lasso_nets_0)$ranking+ ggtitle("weightedLASSO alpha = 0") +
get_hubs(lasso_nets_1)$ranking+ggtitle("weightedLASSO alpha = 1") +
get_hubs(lasso_nets)$ranking+ ggtitle("weightedLASSO specific alpha")
intersect(get_hubs(lasso_nets)$top25 , get_hubs(lasso_nets_1)$top25)
## [1] "HB6" "JKD" "VRN1" "UIF1" "AT3G11280" "TGA1"
## [7] "AT2G40260" "DIV1" "HHO3" "ID" "HHO2" "HAT1"
## [13] "DOF2.4" "ABO3" "AT5G56840" "BZIP2" "EPR1" "ASR3"
## [19] "CRF4"
intersect(get_hubs(lasso_nets)$top25 , get_hubs(lasso_nets_0)$top25)
## [1] "HB6" "JKD" "AT3G11280" "TGA1" "CIL2" "WRKY2"
## [7] "ABO3" "KNAT3" "AT1G61730" "BT5"
# regulators that were not in the top 25
setdiff(get_hubs(lasso_nets)$top25 , union(get_hubs(lasso_nets_0)$top25,
get_hubs(lasso_nets_1)$top25))
## [1] "TGA4"
Getting the most regulated genes in specific GRNs :
get_hubs_incoming <- function(nets){
g <- setNames(rep(0, length(genes)), genes)
for(net in names(nets)){
g[names(table(nets[[net]]$to))] <- g[names(table(nets[[net]]$to))] + table(nets[[net]]$to)/length(nets)
}
df <- data.frame(genes = genes, average_degree = g[genes],
label = annot[match(genes, rownames(annot)), "label"])
plt <- df %>%
slice_max(average_degree, n = 25) %>%
ggplot(aes(x=average_degree, y=reorder(label, average_degree) ))+
geom_point() + theme_bw() + ylab("gene") + xlab("Average in-degree")
top20<- df %>%
slice_max(average_degree, n = 25) %>%
select(label)
return(list(ranking = plt, top20 = top20$label))
}
get_hubs_incoming(rf_nets)$ranking+ ggtitle("weightedRF") +
get_hubs_incoming(lasso_nets)$ranking+ ggtitle("weightedLASSO")
intersect(get_hubs_incoming(rf_nets)$top25 , get_hubs_incoming(lasso_nets)$top25)
## NULL
# library(clusterProfiler)
#
# N_go <- c("GO:0006807", "GO:0051171", "GO:0051173", "GO:0022622", "GO:0034641", "GO:0044271")
#
# convert_from_agi <- function(ids){
# x <- org.At.tair.db::org.At.tairENTREZID
# mapped_genes <- AnnotationDbi::mappedkeys(x)
# xx <- as.list(x[mapped_genes])
# return(unlist(xx[as.vector(ids)]))
# }
#
# nets <-rf_nets
#
# g <- setNames(rep(0, length(genes)), genes)
# for(net in names(nets)){
# g[names(table(nets[[net]]$to))] <- g[names(table(nets[[net]]$to))] + table(nets[[net]]$to)/length(nets)
# }
# df <- data.frame(genes = genes, average_degree = g[genes],
# label = annot[match(genes, rownames(annot)), "label"])
#
#
# degrees <- sort(setNames(df$average_degree,convert_from_agi(df$average_degree)),
# decreasing = T)
#
# gse <- gseGO(geneList=degrees,
# ont ="BP",
# minGSSize = 3,
# maxGSSize = 800, nPerm =1000,
# pvalueCutoff = 1,
# verbose = TRUE, scoreType = "pos",
# OrgDb = org.At.tair.db::org.At.tair.db,
# pAdjustMethod = "BH")
#
# res <- gse@result[gse@result$ID %in% N_go,c("Description", "enrichmentScore", "p.adjust")]
# res_rf <- res
#
# gs <- rbind.data.frame(res_rf_0, res_rf_1, res_rf)
# gs$alpha<- c(rep("alpha_0",4), rep("alpha_1", 4), rep("alpha_specific",4))
# gs$model <- "weightedRF"
#
# gs_lasso <- rbind.data.frame(res_lasso_0, res_lasso_1, res_lasso)
# gs_lasso$alpha <- c(rep("alpha_0",4), rep("alpha_1", 4), rep("alpha_specific",4))
# gs_lasso$model <- "weightedLASSO"
#
# gsea_nitrogen <- rbind.data.frame(gs, gs_lasso)
# save(gsea_nitrogen, file = "results/gsea_nitrogen.rdata")
load("results/gsea_nitrogen.rdata")
gsea_nitrogen %>%
ggplot(aes(x=alpha, y = enrichmentScore, color = Description)) +
ggh4x::facet_nested_wrap(vars(Description,model), nrow = 2)+ theme_bw()+
theme(strip.background = element_blank())+
geom_point() +ggtitle("Enrichment score (GSEA)")
rbind.data.frame(gs, gs_lasso) %>%
ggplot(aes(x=alpha, y = -log(p.adjust), color = Description)) +
ggh4x::facet_nested_wrap(vars(Description,model), nrow = 2)+ theme_bw()+
theme(strip.background = element_blank())+
geom_jitter() + ggtitle("Enrichment p adjusted pavlue (GSEA)")
load("rdata/pwm_prom_jaspar_dap.rdata")
known_tfs <- intersect(genes,unique(pwm_prom$TF))
get_degree_hist <- function(nets, title){
g <- setNames(rep(0, length(genes)), genes)
for(net in names(nets)){
g[names(table(nets[[net]]$to))] <- g[names(table(nets[[net]]$to))] + table(nets[[net]]$to)/length(nets)
}
df <- data.frame(genes = genes, average_degree = g[genes],
label = annot[match(genes, rownames(annot)), "label"])%>%
mutate(pwm = ifelse(genes %in% known_tfs, "Available", "Missing"))
df %>%
filter(genes %in% tfs) %>%
ggplot(aes(x=average_degree, fill = pwm)) +
geom_histogram()+facet_wrap(~ pwm)+ theme_bw()+
ggtitle(title) + theme(legend.position = "none")
}
(get_degree_hist(rf_nets_0, "weightedRF alpha 0")+
get_degree_hist(rf_nets_1, "weightedRF alpha 1")+
get_degree_hist(rf_nets, "weightedRF alpha specific"))/
(get_degree_hist(lasso_nets_0, "weightedLASSO alpha 0")+
get_degree_hist(lasso_nets_1, "weightedLASSO alpha 1")+
get_degree_hist(lasso_nets, "weightedLASSO alpha specific"))
# TGA4
DIANE::get_gene_information(net_rf[net_rf$from == "AT5G10030",]$to, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(net_lasso[net_lasso$from == "AT5G10030",]$to, organism = "Arabidopsis thaliana")
#TGA1
DIANE::get_gene_information(net_rf[net_rf$from == "AT5G65210",]$to, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(net_lasso[net_lasso$from == "AT5G65210",]$to, organism = "Arabidopsis thaliana")
# HH02
DIANE::get_gene_information(net_rf[net_rf$from == "AT1G68670",]$to, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(net_lasso[net_lasso$from == "AT1G68670",]$to, organism = "Arabidopsis thaliana")
get_in_hubs <- function(net){
hubs <- names(table(net$to)[order(table(net$to), decreasing = T)])[1:25]
DIANE::get_gene_information(hubs, organism = "Arabidopsis thaliana")
}
get_in_hubs(net_lasso)
get_in_hubs(net_rf)
DIANE::get_gene_information(net_rf[net_rf$to == "AT1G08090",]$from, organism = "Arabidopsis thaliana")
load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
load("rdata/pwm_prom_jaspar_dap.rdata")
load("rdata/gene_structure.rdata")
load("results/gene_specific_grns.rdata")
load("results/gene_specific_mse.rdata")
load("results/gene_specific_edges.rdata")
mean_expr <- rowMeans(counts)[genes]
var_expr <- matrixStats::rowSds(counts[genes,])*matrixStats::rowSds(counts[genes,])
pwm_prom_n_TFs <- pwm_prom[pwm_prom$TF %in% tfs,]
# to comment for new version where mse is already normalized per genes
# norm_mse <- exp(as.matrix(cbind(lmses, lmses_lasso)))/var_expr
genes_info <- data.frame(genes = genes,
cluster_rf = alphas_rf,
cluster_lasso = alphas_lasso)
genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$nb_motifs <- table(pwm_prom$target)[genes]
genes_info$nb_motifs_n_tfs <- table(pwm_prom_n_TFs$target)[genes]
genes_info$common <- genes %in% common
genes_info[,c("n_introns", "n_transcripts")] <-
gene_structure[match(genes_info$gene, gene_structure$gene),
c("n_introns", "n_transcripts")]
# for rf
genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=log(n_introns))) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of introns for RF groups")) +
genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=n_transcripts)) +
geom_boxplot(width=0.1, fill = "white")+
geom_violin(fill="darkblue", alpha=0.2)+
ggtitle(("Number of transcripts for RF groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=nb_motifs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs in promoter for RF groups")) +
genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=nb_motifs_n_tfs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs of N-responsive TFs in promoter for RF groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=var)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
stat_compare_means()+ genes_info%>%
ggplot(aes(x=factor(cluster_rf), y=expr)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
stat_compare_means()
genes_info %>%
group_by(cluster_rf) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n()) %>%
ggplot(aes(x=cluster_rf, y=tf_frac,
label = paste("n=",n))) +
geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.2) + xlab("cluster RF") +
ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))
###########" for lasso
genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=log(n_introns))) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of introns for LASSO groups")) +
genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=n_transcripts)) +
geom_boxplot(width=0.1, fill = "white")+
geom_violin(fill="darkblue", alpha=0.2)+
ggtitle(("Number of transcripts for LASSO groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=nb_motifs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs in promoter for LASSO groups")) +
genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=nb_motifs_n_tfs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs of N-responsive TFs in promoter for LASSO groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=var)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
stat_compare_means()+ genes_info%>%
ggplot(aes(x=factor(cluster_lasso), y=expr)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
stat_compare_means()
genes_info %>%
group_by(cluster_lasso) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n()) %>%
ggplot(aes(x=cluster_lasso, y=tf_frac,
label = paste("n=",n))) +
geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.2) + xlab("cluster RF") +
ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))
######## common classes of interest as compared to the rest
length(intersect(common, tfs))/length(common)
## [1] 0.2045455
genes_info%>%
ggplot(aes(x=common, y=log(n_introns))) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of introns for common genes of interest")) +
genes_info%>%
ggplot(aes(x=common, y=n_transcripts)) +
geom_boxplot(width=0.1, fill = "white")+
geom_violin(fill="darkblue", alpha=0.2)+
ggtitle(("Number of transcripts for common groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=common, y=nb_motifs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs in promoter for common groups")) +
genes_info%>%
ggplot(aes(x=common, y=nb_motifs_n_tfs)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Number of motifs of N-responsive TFs in promoter for common groups")) +
stat_compare_means()
genes_info%>%
ggplot(aes(x=common, y=var)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene variance for common groups")) + scale_y_log10()+
stat_compare_means()+ genes_info%>%
ggplot(aes(x=common, y=expr)) +
geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene expression for common groups")) + scale_y_log10()+
stat_compare_means()
genes_info %>%
group_by(common) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n()) %>%
ggplot(aes(x=common, y=tf_frac,
label = paste("n=",n))) +
geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.2) + xlab("cluster common") +
ggtitle("Fraction of TFs in common groups")+ylim(c(0,0.22))
phyper(q=length(intersect(common, tfs)),
m = length(tfs),
n = length(genes) - length(tfs),
k = length(common), lower.tail = F)
## [1] 0.004688486